import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict, Optional
from src.models.denoiser.mmdit import RMSNorm, attention
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial
from typing import Callable
from timm.models.layers import DropPath
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
from einops import rearrange, repeat
import numpy as np


# Large negative sentinel value. It is not referenced by any definition in
# the visible part of this file; presumably intended for masking logits --
# TODO confirm against the rest of the project before removing.
NEG_INF = -1000000

class SS2D(nn.Module):
    """Four-direction 2D selective scan, optionally conditioned on a style map.

    A ``(B, C, H, W)`` feature map is unrolled into four sequences (row-major,
    column-major and the reverse of each).  Every sequence is processed by
    ``selective_scan_fn`` and the four results are mapped back to row-major
    order and summed.

    * ``is_cross=False``: dt, B and C are all projected from ``x`` and ``x``
      is the scanned signal.
    * ``is_cross=True``: dt and B are projected from a second "style" map,
      which is also the scanned signal; only C is projected from ``x``.

    This block has no input or output projection, so ``forward`` expects
    inputs that already carry ``d_inner = int(expand * d_model)`` channels and
    returns ``d_inner`` channels.  With the default ``expand=2.`` the caller
    must therefore feed ``2 * d_model`` channels.

    Args:
        d_model: Base width; used to derive ``d_inner`` and the "auto" dt rank.
        d_state: State size N of the selective scan.
        d_conv: Kernel size of the two depthwise convolutions.
        expand: Multiplier giving ``d_inner = int(expand * d_model)``.
        dt_rank: Rank of the dt projection, or ``"auto"`` for
            ``ceil(d_model / 16)``.
        dt_min, dt_max, dt_init, dt_scale, dt_init_floor: Forwarded to
            :meth:`dt_init`.
        dropout: Accepted for interface compatibility; not used.
        conv_bias: Whether the depthwise convolutions have a bias.
        bias: Accepted for interface compatibility; not used.
        device, dtype: Factory arguments for the created parameters.
            ``A_logs`` and ``Ds`` are always created in fp32.
        is_cross: Build the style-conditioned variant (see above).
        args: Stored on ``self.args``; not otherwise used here.
        bc_norm: ``None``, ``"rms"`` or ``"ln"``; normalisation applied to the
            B and C matrices along the state axis.
        **kwargs: Ignored.

    Raises:
        ValueError: If ``bc_norm`` is not one of the supported values.
    """

    def __init__(
            self,
            d_model,
            d_state=16,
            d_conv=3,
            expand=2.,
            dt_rank="auto",
            dt_min=0.001,
            dt_max=0.1,
            dt_init="random",
            dt_scale=1.0,
            dt_init_floor=1e-4,
            dropout=0.,
            conv_bias=True,
            bias=False,
            device=None,
            dtype=None,
            is_cross=False,
            args=None,
            bc_norm=None,
            **kwargs,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.args = args
        self.d_model = d_model
        self.d_state = d_state
        self.d_conv = d_conv
        self.expand = expand
        self.d_inner = int(self.expand * self.d_model)
        self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank

        # Depthwise convolutions: `conv2d` keeps the spatial size (odd kernel,
        # "same" padding) and is applied to x; `conv2d_2` has stride 2 and is
        # applied to the style map, so the style input must be large enough to
        # end up with exactly x's spatial size after downsampling.
        depthwise_kwargs = dict(
            in_channels=self.d_inner,
            out_channels=self.d_inner,
            groups=self.d_inner,
            bias=conv_bias,
            kernel_size=d_conv,
            padding=(d_conv - 1) // 2,
            **factory_kwargs,
        )
        self.conv2d = nn.Conv2d(**depthwise_kwargs)
        self.conv2d_2 = nn.Conv2d(stride=2, **depthwise_kwargs)
        self.act = nn.SiLU()

        def stacked_linear_weights(out_features):
            # One bias-free Linear per scan direction; only the stacked
            # weights are kept, as a single (K=4, out_features, d_inner)
            # parameter consumed by an einsum in forward_core.
            layers = [
                nn.Linear(self.d_inner, out_features, bias=False, **factory_kwargs)
                for _ in range(4)
            ]
            return nn.Parameter(torch.stack([layer.weight for layer in layers], dim=0))

        if is_cross:
            # Style map -> (dt, B); x -> C only.
            self.style_proj_weight = stacked_linear_weights(self.dt_rank + self.d_state)
            self.x_proj_weight = stacked_linear_weights(self.d_state)
        else:
            # x -> (dt, B, C).
            self.x_proj_weight = stacked_linear_weights(self.dt_rank + self.d_state * 2)

        dt_layers = [
            self.dt_init(self.dt_rank, self.d_inner, dt_scale, dt_init, dt_min, dt_max,
                         dt_init_floor, **factory_kwargs)
            for _ in range(4)
        ]
        self.dt_projs_weight = nn.Parameter(
            torch.stack([layer.weight for layer in dt_layers], dim=0))  # (K=4, d_inner, dt_rank)
        self.dt_projs_bias = nn.Parameter(
            torch.stack([layer.bias for layer in dt_layers], dim=0))  # (K=4, d_inner)

        # `device` is forwarded so these end up next to the other parameters.
        self.A_logs = self.A_log_init(
            self.d_state, self.d_inner, copies=4, device=device, merge=True)  # (K * d_inner, d_state)
        self.Ds = self.D_init(self.d_inner, copies=4, device=device, merge=True)  # (K * d_inner,)

        self.selective_scan = selective_scan_fn

        self.out_norm = nn.LayerNorm(self.d_inner, device=device)

        # B/C normalisation acts on the state axis (size d_state); see
        # _norm_over_state for how it is applied to (B, K, d_state, L) tensors.
        if bc_norm == "rms":
            self.ln_b = RMSNorm(self.d_state, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
            self.ln_c = RMSNorm(self.d_state, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
        elif bc_norm == "ln":
            self.ln_b = nn.LayerNorm(self.d_state, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
            self.ln_c = nn.LayerNorm(self.d_state, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
        elif bc_norm is None:
            self.ln_b = nn.Identity()
            self.ln_c = nn.Identity()
        else:
            raise ValueError(bc_norm)

    @staticmethod
    def dt_init(dt_rank, d_inner, dt_scale=1.0, dt_init="random", dt_min=0.001, dt_max=0.1, dt_init_floor=1e-4,
                **factory_kwargs):
        """Create the ``dt_rank -> d_inner`` projection with its special init.

        Weights are set to ``±dt_rank ** -0.5 * dt_scale`` (constant or
        uniform).  The bias is set so that ``softplus(bias)`` is log-uniformly
        distributed in ``[dt_min, dt_max]`` (clamped below by
        ``dt_init_floor``).

        Raises:
            NotImplementedError: If ``dt_init`` is neither ``"constant"`` nor
                ``"random"``.
        """
        dt_proj = nn.Linear(dt_rank, d_inner, bias=True, **factory_kwargs)

        # Initialize special dt projection to preserve variance at initialization
        dt_init_std = dt_rank ** -0.5 * dt_scale
        if dt_init == "constant":
            nn.init.constant_(dt_proj.weight, dt_init_std)
        elif dt_init == "random":
            nn.init.uniform_(dt_proj.weight, -dt_init_std, dt_init_std)
        else:
            raise NotImplementedError

        # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
        dt = torch.exp(
            torch.rand(d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
            + math.log(dt_min)
        ).clamp(min=dt_init_floor)
        # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
        inv_dt = dt + torch.log(-torch.expm1(-dt))
        with torch.no_grad():
            dt_proj.bias.copy_(inv_dt)
        # Marks the bias so a generic re-initialisation pass can skip it.
        dt_proj.bias._no_reinit = True

        return dt_proj

    @staticmethod
    def A_log_init(d_state, d_inner, copies=1, device=None, merge=True):
        """Return ``log(A)`` with ``A[d, n] = n + 1`` (S4D real initialisation).

        The result has shape ``(d_inner, d_state)``; with ``copies > 1`` it is
        repeated to ``(copies, d_inner, d_state)`` and, if ``merge`` is true,
        flattened to ``(copies * d_inner, d_state)``.  Always fp32.
        """
        A = repeat(
            torch.arange(1, d_state + 1, dtype=torch.float32, device=device),
            "n -> d n",
            d=d_inner,
        ).contiguous()
        A_log = torch.log(A)  # Keep A_log in fp32
        if copies > 1:
            A_log = repeat(A_log, "d n -> r d n", r=copies)
            if merge:
                A_log = A_log.flatten(0, 1)
        A_log = nn.Parameter(A_log)
        A_log._no_weight_decay = True
        return A_log

    @staticmethod
    def D_init(d_inner, copies=1, device=None, merge=True):
        """Return the all-ones "skip" parameter D.

        Shape is ``(d_inner,)``; with ``copies > 1`` it is repeated to
        ``(copies, d_inner)`` and, if ``merge`` is true, flattened to
        ``(copies * d_inner,)``.
        """
        D = torch.ones(d_inner, device=device)
        if copies > 1:
            D = repeat(D, "n1 -> r n1", r=copies)
            if merge:
                D = D.flatten(0, 1)
        D = nn.Parameter(D)  # Keep in fp32
        D._no_weight_decay = True
        return D

    @staticmethod
    def _scan_sequences(feat):
        """Unroll a contiguous (B, C, H, W) map into its four scan orders.

        Returns a ``(B, 4, C, H*W)`` tensor holding, in this order: row-major,
        column-major, reversed row-major, reversed column-major.
        """
        B, _, H, W = feat.shape
        L = H * W
        row_major = feat.view(B, -1, L)
        col_major = torch.transpose(feat, dim0=2, dim1=3).contiguous().view(B, -1, L)
        forward_pair = torch.stack([row_major, col_major], dim=1).view(B, 2, -1, L)
        return torch.cat([forward_pair, torch.flip(forward_pair, dims=[-1])], dim=1)

    @staticmethod
    def _norm_over_state(norm, t):
        """Apply ``norm`` along the state axis of a (B, K, d_state, L) tensor.

        The norm layers act on the last axis, so the state axis is moved there
        and back.  ``nn.Identity`` is applied directly, leaving ``t`` untouched.
        """
        if isinstance(norm, nn.Identity):
            return norm(t)
        return norm(t.transpose(-1, -2)).transpose(-1, -2).contiguous()

    def forward_core(self, x: torch.Tensor, style=None):
        """Run the four-direction selective scan.

        Args:
            x: Contiguous ``(B, d_inner, H, W)`` feature map.
            style: ``None`` for a module built with ``is_cross=False``;
                otherwise a contiguous map with exactly the shape of ``x``.

        Returns:
            Four ``(B, d_inner, H*W)`` tensors, all in row-major order, one
            per scan direction (forward rows, backward rows, forward columns,
            backward columns).

        Raises:
            ValueError: If ``style`` is given/omitted inconsistently with
                ``is_cross``, or its shape differs from ``x``.
        """
        B, C, H, W = x.shape
        L = H * W
        K = 4

        has_style_proj = hasattr(self, "style_proj_weight")
        if style is not None and not has_style_proj:
            raise ValueError("a style map was given but this SS2D was built with is_cross=False")
        if style is None and has_style_proj:
            raise ValueError("this SS2D was built with is_cross=True and requires a style map")
        if style is not None and style.shape != x.shape:
            raise ValueError(
                f"style map shape {tuple(style.shape)} must match x shape {tuple(x.shape)}"
            )

        xs = self._scan_sequences(x)  # (B, K, d_inner, L)
        x_dbl = torch.einsum("b k d l, k c d -> b k c l", xs.view(B, K, -1, L), self.x_proj_weight)

        if style is not None:
            # Cross mode: dt and B come from the style map, C from x, and the
            # style sequences are what gets scanned.
            scan_input = self._scan_sequences(style)
            s_dbl = torch.einsum(
                "b k d l, k c d -> b k c l", scan_input.view(B, K, -1, L), self.style_proj_weight)
            dts, Bs = torch.split(s_dbl, [self.dt_rank, self.d_state], dim=2)
            Cs = x_dbl
        else:
            scan_input = xs
            dts, Bs, Cs = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=2)

        # Expand the low-rank dt to one step size per channel.
        dts = torch.einsum("b k r l, k d r -> b k d l", dts.view(B, K, -1, L), self.dt_projs_weight)
        dts = dts.contiguous().view(B, -1, L)  # (B, K * d_inner, L)
        Bs = Bs.view(B, K, -1, L)
        Cs = Cs.view(B, K, -1, L)  # (B, K, d_state, L)
        Cs = self._norm_over_state(self.ln_c, Cs)
        Bs = self._norm_over_state(self.ln_b, Bs)

        Ds = self.Ds.float().view(-1)
        As = -torch.exp(self.A_logs.float()).view(-1, self.d_state)
        dt_projs_bias = self.dt_projs_bias.float().view(-1)  # (K * d_inner)

        out_y = self.selective_scan(
            scan_input.view(B, -1, L), dts,
            As, Bs, Cs, Ds, z=None,
            delta_bias=dt_projs_bias,
            delta_softplus=True,
            return_last_state=False,
        ).view(B, K, -1, L)

        # Undo the reversal of directions 2/3 and the transpose of the
        # column-major directions so all four outputs are row-major.
        inv_y = torch.flip(out_y[:, 2:4], dims=[-1]).view(B, 2, -1, L)
        wh_y = torch.transpose(out_y[:, 1].view(B, -1, W, H), dim0=2, dim1=3).contiguous().view(B, -1, L)
        invwh_y = torch.transpose(inv_y[:, 1].view(B, -1, W, H), dim0=2, dim1=3).contiguous().view(B, -1, L)

        return out_y[:, 0], inv_y[:, 0], wh_y, invwh_y

    def forward(self, x: torch.Tensor, style=None, **kwargs):
        """Apply the block to a channels-last feature map.

        Args:
            x: ``(B, H, W, d_inner)`` input.
            style: Optional ``(B, Hs, Ws, d_inner)`` style map (required when
                built with ``is_cross=True``).  It is downsampled by the
                stride-2 depthwise conv and must then match ``(H, W)``.
            **kwargs: Ignored.

        Returns:
            ``(B, H*W, d_inner)`` tensor, layer-normalised over the channels.

        Raises:
            ValueError: If ``x`` does not have ``d_inner`` channels, or on the
                conditions documented in :meth:`forward_core`.
        """
        B, H, W, C = x.shape
        if C != self.d_inner:
            raise ValueError(
                f"expected {self.d_inner} input channels (d_inner), got {C}"
            )

        x = x.permute(0, 3, 1, 2).contiguous()
        x = self.act(self.conv2d(x))

        if style is not None:
            style = style.permute(0, 3, 1, 2).contiguous()
            style = self.act(self.conv2d_2(style))

        y1, y2, y3, y4 = self.forward_core(x, style=style)

        y = y1 + y2 + y3 + y4
        y = torch.transpose(y, dim0=1, dim1=2).contiguous().view(B, H * W, -1)
        y = self.out_norm(y)
        return y
    
class CrossAttention(nn.Module):
    """Multi-head cross-attention: queries from ``x``, keys/values from ``context``.

    Args:
        dim: Width of ``x`` and of the attention output.
        cross_dim: Width of ``context``; defaults to ``dim``.
        num_heads: Number of heads; must divide ``dim``.
        qkv_bias: Whether the q and kv projections have a bias.
        qk_scale: Accepted for interface compatibility; not used.
        attn_mode: Must be one of ``ATTENTION_MODES``.  It is stored on the
            module, but ``forward`` does not pass it to ``attention``.
        pre_only: If true, no output projection is created or applied.
        qk_norm: ``None``, ``"rms"`` or ``"ln"``; per-head normalisation of q
            and k over ``head_dim``.
        rmsnorm: Accepted for interface compatibility; not used.
        dtype, device: Factory arguments for the created layers.

    Raises:
        ValueError: If ``dim`` is not divisible by ``num_heads`` or ``qk_norm``
            is not a supported value.
    """

    ATTENTION_MODES = ("xformers", "torch", "torch-hb", "math", "debug")

    def __init__(
        self,
        dim: int,
        cross_dim: Optional[int] = None,  # allows a context with a different width
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_scale: Optional[float] = None,
        attn_mode: str = "xformers",
        pre_only: bool = False,
        qk_norm: Optional[str] = None,
        rmsnorm: bool = False,
        dtype=None,
        device=None,
    ):
        super().__init__()
        # Without this check head_dim * num_heads < dim, and the reshapes in
        # pre_attention would either fail or silently fold the leftover
        # channels into the sequence axis.
        if dim % num_heads != 0:
            raise ValueError(f"dim ({dim}) must be divisible by num_heads ({num_heads})")
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        cross_dim = cross_dim if cross_dim is not None else dim  # defaults to dim

        # Separate projections: q from x, fused k/v from the context.
        self.q = nn.Linear(dim, dim, bias=qkv_bias, dtype=dtype, device=device)
        self.kv = nn.Linear(cross_dim, dim * 2, bias=qkv_bias, dtype=dtype, device=device)

        if not pre_only:
            self.proj = nn.Linear(dim, dim, dtype=dtype, device=device)
        assert attn_mode in self.ATTENTION_MODES
        self.attn_mode = attn_mode
        self.pre_only = pre_only

        # Optional per-head q/k normalisation.
        if qk_norm == "rms":
            self.ln_q = RMSNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
            self.ln_k = RMSNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
        elif qk_norm == "ln":
            self.ln_q = nn.LayerNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
            self.ln_k = nn.LayerNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
        elif qk_norm is None:
            self.ln_q = nn.Identity()
            self.ln_k = nn.Identity()
        else:
            raise ValueError(qk_norm)

    def pre_attention(self, x: torch.Tensor, context: torch.Tensor):
        """Project and normalise the attention inputs.

        Args:
            x: ``(B, L, dim)`` query source.
            context: ``(B, L_context, cross_dim)`` key/value source.

        Returns:
            ``(q, k, v)`` where ``q`` is ``(B, L, dim)``, ``k`` is
            ``(B, L_context, dim)`` and ``v`` is left split per head as
            ``(B, L_context, num_heads, head_dim)``.
        """
        batch = x.shape[0]
        q = self.q(x)  # (B, L, dim)
        k, v = self.kv(context).chunk(2, dim=-1)  # each (B, L_context, dim)

        # Split into heads: (B, seq_len, num_heads, head_dim).
        q = q.reshape(batch, -1, self.num_heads, self.head_dim)
        k = k.reshape(batch, -1, self.num_heads, self.head_dim)
        v = v.reshape(batch, -1, self.num_heads, self.head_dim)

        # Normalise over head_dim, then merge the heads of q and k back.
        q = self.ln_q(q).reshape(batch, -1, self.num_heads * self.head_dim)
        k = self.ln_k(k).reshape(batch, -1, self.num_heads * self.head_dim)
        return q, k, v

    def post_attention(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the output projection unless the module is ``pre_only``."""
        if self.pre_only:
            return x
        return self.proj(x)

    def forward(self, x: torch.Tensor, context: torch.Tensor) -> torch.Tensor:
        """Attend from ``x`` to ``context`` and project the result."""
        q, k, v = self.pre_attention(x, context)
        # NOTE(review): assumes the imported `attention` helper accepts keys and
        # values whose sequence length differs from the queries' -- confirm.
        x = attention(q, k, v, self.num_heads)
        x = self.post_attention(x)
        return x
    
class IPAttnProcessor(torch.nn.Module):
    r"""
    Adapter block that injects reference-image features into a token sequence.

    The reference tokens first cross-attend to ``context``; the result is then
    used as the style map of a cross-mode ``SS2D`` scan over ``x``.  The scan
    output is added to ``x`` (scaled), projected, passed through dropout and
    added to the original ``x`` again.

    Args:
        hidden_size (`int`):
            Channel width of ``x``, of the reference tokens and of the output.
        scale (`float`, defaults to 1.0):
            Weight of the scan branch before the output projection.
        dropout (`float`, defaults to 0.0):
            Dropout probability applied after the output projection.
        out_bias (`bool`, defaults to `True`):
            Whether the output projection has a bias.
    """

    def __init__(self, hidden_size, scale=1.0,dropout=0.0, out_bias=True):
        super().__init__()

        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.hidden_size = hidden_size
        self.scale = scale
        self.attn_t = CrossAttention(hidden_size,num_heads=8,qkv_bias=True,attn_mode="xformers",pre_only=False)
        # expand=1 keeps the scan's inner width equal to hidden_size, which is
        # required because SS2D has no input/output projection of its own.
        self.attn_z = SS2D(d_model=hidden_size, d_state=16, expand=1., dropout=0, is_cross=True)
        self.to_out = nn.ModuleList([
            nn.Linear(self.hidden_size, self.hidden_size, bias=out_bias),
            nn.Dropout(dropout),
        ])

    @staticmethod
    def _square_side(num_tokens, what):
        """Return the side of the square grid holding ``num_tokens`` tokens.

        Raises:
            ValueError: If ``num_tokens`` is not a perfect square.
        """
        side = math.isqrt(num_tokens)
        if side * side != num_tokens:
            raise ValueError(
                f"{what} has {num_tokens} tokens, which cannot be arranged as a square grid"
            )
        return side

    # Implemented as `forward` (rather than overriding `__call__`) so that
    # nn.Module's hook machinery runs; `module(x, ref_img, context)` is unchanged.
    def forward(
        self,x, ref_img, context
    ):
        """Fuse reference-image tokens into ``x``.

        Args:
            x: ``(B, Lx, hidden_size)`` tokens; ``Lx`` must be a perfect square.
            ref_img: ``(B, 1 + L, hidden_size)`` reference tokens.  The first
                token along dim 1 is dropped (presumably a class token --
                confirm); ``L`` must be a perfect square.  The reference grid
                goes through a stride-2 convolution inside ``SS2D`` and must
                match ``x``'s grid afterwards.
            context: Key/value source for the cross-attention.

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If either token count is not a perfect square.
        """
        residual = x

        hidden_states_t = self.attn_t(ref_img[:,1:], context)

        B, L, C = hidden_states_t.shape
        Bx, Lx, Cx = x.shape
        side_t = self._square_side(L, "ref_img (without its first token)")
        side_x = self._square_side(Lx, "x")
        # reshape (not view) so a non-contiguous `x` from the caller also works.
        hidden_states_t = hidden_states_t.reshape(B, side_t, side_t, C)
        x_tmp = x.reshape(Bx, side_x, side_x, Cx)  # [B,H,W,C]
        hidden_states_z = self.attn_z(x_tmp, hidden_states_t)

        hidden_states = x + self.scale * hidden_states_z
        # linear proj
        hidden_states = self.to_out[0](hidden_states)
        # dropout
        hidden_states = self.to_out[1](hidden_states)

        hidden_states = hidden_states + residual

        return hidden_states

class MipAdapter(nn.Module):
    def __init__(self, hidden_size, c_dim=768, projector_dim=2048,scale=0.5,mip_depth=6,start_rag=8, end_rag=13):
        super(MipAdapter, self).__init__()
        self.start_rag=start_rag
        self.end_rag=end_rag
        self.attention_process = nn.ModuleList(
            [
                IPAttnProcessor(
                    hidden_size, scale=scale,
                )
                for i in range(mip_depth)
            ]
        )
        self.img_linear=nn.Sequential(
                nn.Linear(c_dim, projector_dim),
                nn.SiLU(),
                nn.Linear(projector_dim, projector_dim),
                nn.SiLU(),
                nn.Linear(projector_dim, hidden_size),
            )